WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/igalk/OneDrive/Documentos/R/win-library/4.0’
(as ‘lib’ is unspecified)
also installing the dependencies ‘bitops’, ‘coda’, ‘gtools’, ‘caTools’, ‘statnet.common’, ‘TSP’, ‘qap’, ‘gclus’, ‘gplots’, ‘registry’, ‘network’, ‘sna’, ‘shape’, ‘seriation’, ‘igraph’, ‘ggnetwork’, ‘plotly’, ‘visNetwork’, ‘discretization’, ‘glmnet’, ‘pmml’, ‘XML’, ‘arulesViz’, ‘arulesCBA’
There are binary versions available but the source versions are later:
Binaries will be installed
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/bitops_1.0-7.zip'
Content type 'application/zip' length 42425 bytes (41 KB)
downloaded 41 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/coda_0.19-4.zip'
Content type 'application/zip' length 322755 bytes (315 KB)
downloaded 315 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/gtools_3.9.2.zip'
Content type 'application/zip' length 366894 bytes (358 KB)
downloaded 358 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/caTools_1.18.2.zip'
Content type 'application/zip' length 316400 bytes (308 KB)
downloaded 308 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/statnet.common_4.5.0.zip'
Content type 'application/zip' length 246982 bytes (241 KB)
downloaded 241 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/TSP_1.1-10.zip'
Content type 'application/zip' length 1026352 bytes (1002 KB)
downloaded 1002 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/qap_0.1-1.zip'
Content type 'application/zip' length 537689 bytes (525 KB)
downloaded 525 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/gclus_1.3.2.zip'
Content type 'application/zip' length 416500 bytes (406 KB)
downloaded 406 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/gplots_3.1.1.zip'
Content type 'application/zip' length 603126 bytes (588 KB)
downloaded 588 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/registry_0.5-1.zip'
Content type 'application/zip' length 197181 bytes (192 KB)
downloaded 192 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/network_1.17.1.zip'
Content type 'application/zip' length 852583 bytes (832 KB)
downloaded 832 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/sna_2.6.zip'
Content type 'application/zip' length 1316382 bytes (1.3 MB)
downloaded 1.3 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/shape_1.4.6.zip'
Content type 'application/zip' length 788976 bytes (770 KB)
downloaded 770 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/seriation_1.2-9.zip'
Content type 'application/zip' length 1232636 bytes (1.2 MB)
downloaded 1.2 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/igraph_1.2.6.zip'
Content type 'application/zip' length 9346426 bytes (8.9 MB)
downloaded 8.9 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/plotly_4.9.4.1.zip'
Content type 'application/zip' length 3031022 bytes (2.9 MB)
downloaded 2.9 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/visNetwork_2.0.9.zip'
Content type 'application/zip' length 4595190 bytes (4.4 MB)
downloaded 4.4 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/discretization_1.0-1.zip'
Content type 'application/zip' length 100041 bytes (97 KB)
downloaded 97 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/glmnet_4.1-1.zip'
Content type 'application/zip' length 2255757 bytes (2.2 MB)
downloaded 2.2 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/pmml_2.4.0.zip'
Content type 'application/zip' length 664041 bytes (648 KB)
downloaded 648 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/XML_3.99-0.6.zip'
Content type 'application/zip' length 4260833 bytes (4.1 MB)
downloaded 4.1 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/arulesViz_1.5-0.zip'
Content type 'application/zip' length 1758920 bytes (1.7 MB)
downloaded 1.7 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/arulesCBA_1.2.0.zip'
Content type 'application/zip' length 225149 bytes (219 KB)
downloaded 219 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/arules_1.6-8.zip'
Content type 'application/zip' length 2575551 bytes (2.5 MB)
downloaded 2.5 MB
package ‘bitops’ successfully unpacked and MD5 sums checked
package ‘coda’ successfully unpacked and MD5 sums checked
package ‘gtools’ successfully unpacked and MD5 sums checked
package ‘caTools’ successfully unpacked and MD5 sums checked
package ‘statnet.common’ successfully unpacked and MD5 sums checked
package ‘TSP’ successfully unpacked and MD5 sums checked
package ‘qap’ successfully unpacked and MD5 sums checked
package ‘gclus’ successfully unpacked and MD5 sums checked
package ‘gplots’ successfully unpacked and MD5 sums checked
package ‘registry’ successfully unpacked and MD5 sums checked
package ‘network’ successfully unpacked and MD5 sums checked
package ‘sna’ successfully unpacked and MD5 sums checked
package ‘shape’ successfully unpacked and MD5 sums checked
package ‘seriation’ successfully unpacked and MD5 sums checked
package ‘igraph’ successfully unpacked and MD5 sums checked
package ‘plotly’ successfully unpacked and MD5 sums checked
package ‘visNetwork’ successfully unpacked and MD5 sums checked
package ‘discretization’ successfully unpacked and MD5 sums checked
package ‘glmnet’ successfully unpacked and MD5 sums checked
package ‘pmml’ successfully unpacked and MD5 sums checked
package ‘XML’ successfully unpacked and MD5 sums checked
package ‘arulesViz’ successfully unpacked and MD5 sums checked
package ‘arulesCBA’ successfully unpacked and MD5 sums checked
package ‘arules’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\igalk\AppData\Local\Temp\RtmpMjsq8N\downloaded_packages
installing the source package ‘ggnetwork’
trying URL 'https://cran.rstudio.com/src/contrib/ggnetwork_0.5.9.tar.gz'
Content type 'application/x-gzip' length 2092646 bytes (2.0 MB)
downloaded 2.0 MB
* installing *source* package 'ggnetwork' ...
** package 'ggnetwork' successfully unpacked and MD5 sums checked
** using staged installation
** R
** inst
** byte-compile and prepare package for lazy loading
Warning message:
package 'ggplot2' was built under R version 4.0.5
** help
*** installing help indices
converting help for package 'ggnetwork'
finding HTML links ... done
format_fortify html
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/format_fortify.Rd:69: file link 'igraph-package' in package 'igraph' does not exist and so has been treated as a topic
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/format_fortify.Rd:24: file link 'igraph-package' in package 'igraph' does not exist and so has been treated as a topic
fortify html
fortify.igraph html
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/fortify.igraph.Rd:5: file link 'igraph-package' in package 'igraph' does not exist and so has been treated as a topic
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/fortify.igraph.Rd:56: file link 'igraph-package' in package 'igraph' does not exist and so has been treated as a topic
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/fortify.igraph.Rd:19: file link 'igraph-package' in package 'igraph' does not exist and so has been treated as a topic
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/fortify.igraph.Rd:24: file link 'igraph-package' in package 'igraph' does not exist and so has been treated as a topic
fortify.network html
geom_edges html
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/geom_edges.Rd:84: file link 'geom_curve' in package 'ggplot2' does not exist and so has been treated as a topic
geom_edgetext html
finding level-2 HTML links ... done
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/geom_edgetext.Rd:104: file link 'geom_label' in package 'ggplot2' does not exist and so has been treated as a topic
geom_edgetext_repel html
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/geom_edgetext_repel.Rd:121: file link 'geom_label_repel' in package 'ggrepel' does not exist and so has been treated as a topic
geom_nodes html
geom_nodetext html
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/geom_nodetext.Rd:106: file link 'geom_label' in package 'ggplot2' does not exist and so has been treated as a topic
geom_nodetext_repel html
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/geom_nodetext_repel.Rd:118: file link 'geom_label_repel' in package 'ggrepel' does not exist and so has been treated as a topic
ggnetwork html
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/ggnetwork.Rd:11: file link 'igraph-package' in package 'igraph' does not exist and so has been treated as a topic
scale_safely html
theme_blank html
theme_facet html
unit html
Rd warning: C:/Users/igalk/AppData/Local/Temp/Rtmp4eUPfq/R.INSTALL6770bf94d2a/ggnetwork/man/unit.Rd:7: file link 'unit' in package 'ggplot2' does not exist and so has been treated as a topic
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
*** arch - i386
Warning: package 'ggplot2' was built under R version 4.0.5
*** arch - x64
Warning: package 'ggplot2' was built under R version 4.0.5
** testing if installed package can be loaded from final location
*** arch - i386
Warning: package 'ggplot2' was built under R version 4.0.5
*** arch - x64
Warning: package 'ggplot2' was built under R version 4.0.5
** testing if installed package keeps a record of temporary installation path
* DONE (ggnetwork)
The downloaded source packages are in
‘C:\Users\igalk\AppData\Local\Temp\RtmpMjsq8N\downloaded_packages’
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Attaching package: ‘arules’
The following object is masked from ‘package:dplyr’:
recode
The following objects are masked from ‘package:base’:
abbreviate, write
# Print `df_lyrics` to inspect it; the object is not created anywhere in this section.
df_lyrics
Error: object 'df_lyrics' not found
# Build one row per (artist_key, track URL) from the raw audio-features table.
#   artist_all: every artist_name of the track, joined with the ",|," separator.
#   artist_key: the first artist of artist_all, later used as a join key
#               against the charts table.
df_audio_features <- df_audio_features_raw %>%
  group_by(track_name, external_urls_spotify) %>%
  mutate(artist_all = paste(artist_name, collapse = ",|,")) %>%
  ungroup() %>%
  # The pipe must be escaped: unescaped, ",|,.*" is an alternation that matches
  # at the first plain comma, so an artist name that itself contains a comma
  # would be cut short instead of being split at the ",|," separator.
  mutate(artist_key = sub(",\\|,.*", "", artist_all)) %>%
  dplyr::select(artist_name, artist_all, artist_key, everything()) %>%
  distinct(artist_key, external_urls_spotify, .keep_all = TRUE) %>%
  as.data.frame()
Error in df_audio_features_raw %>% group_by(track_name, external_urls_spotify) %>% :
could not find function "%>%"
# cant_markets
# Collapse the raw chart rows into one row per (Artist, Track_Name, URL).
# The section heading had been fused into the assignment target
# ("cant_marketsdf_charts"); every later chunk reads `df_charts`, so that is
# the name assigned here.
#   semanas_sum           number of chart rows for the track
#   streams_sum/min/max   streams in millions
#   position_avg/min/max  chart position statistics
#   indicador             streams_sum * semanas_sum / position_avg
df_charts <- df_charts_raw %>%
  group_by(Artist, Track_Name, URL) %>%
  dplyr::summarise(
    semanas_sum = n(),
    streams_sum = sum(Streams, na.rm = TRUE) / 10^6,
    streams_min = min(Streams) / 10^6,
    streams_max = max(Streams) / 10^6,
    position_avg = mean(Position, na.rm = TRUE),
    position_min = min(Position),
    position_max = max(Position),
    # Drop all grouping explicitly; this replaces the trailing ungroup() and
    # silences the "grouped output" message.
    .groups = "drop"
  ) %>%
  mutate(indicador = as.numeric(streams_sum * semanas_sum / position_avg))
`summarise()` has grouped output by 'Artist', 'Track_Name'. You can override using the `.groups` argument.
# audio_features and charts
# Join so that every chart row carries the audio features of its song.
# 22993 complete rows are expected.
# (The section heading used to be fused with this comment, which made the
# line unparseable as R.)
join_audio_charts <- df_audio_features %>%
  # dplyr::select is spelled out, as elsewhere in this file, so that a
  # masking `select` from another attached package cannot be picked up.
  dplyr::select(
    "artist_name", "artist_all", "artist_key",
    "track_name", "external_urls_spotify", "album_name", "album_release_year",
    all_of(features_continuas), all_of(features_categoricas)
  ) %>%
  # right_join keeps every chart row, including those with no matching features.
  right_join(
    df_charts,
    by = c(
      "track_name" = "Track_Name",
      "artist_key" = "Artist",
      "external_urls_spotify" = "URL"
    )
  )
# Some chart entries have no audio features; keep this in mind for the analysis.
library(mice)
# Missing-data pattern of the joined table, with rotated column names.
md.pattern(x = join_audio_charts, rotate.names = TRUE)
# Rows of `popularidad` whose `indicador` is missing.
popularidad[which(is.na(popularidad$indicador)), ]
# Aggregate all chart weeks per song.
features_continuas <- c(
  "acousticness", "danceability", "duration_ms", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "tempo", "valence", "cant_markets"
)
features_categoricas <- c("explicit", "key_name", "mode_name", "key_mode", "album_type")
groupping_cols <- c(
  "artist_name", "artist_all", "artist_key", "track_name",
  "external_urls_spotify", "album_name", "album_release_year"
)
numeric_col_charts <- c("Position", "Streams")
week_start <- c("week_start")

# Group once by the song-identifying columns (taken from groupping_cols rather
# than repeating the list by hand).
chart_group <- join_audio_charts %>%
  group_by(across(all_of(groupping_cols)))

# all_of() marks the character vectors as external column names, which removes
# the tidyselect ambiguity message. `.groups = "drop_last"` matches the
# grouping that summarise_at()/summarise() returned by default.
# Mean of every continuous feature per song.
continuas_summarized <- chart_group %>%
  summarise(
    across(all_of(features_continuas), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop_last"
  )
# First observed value of every categorical feature per song.
categoricas_summarizes <- chart_group %>%
  summarise(across(all_of(features_categoricas), first), .groups = "drop_last")
# NOTE(review): the recorded run of this step failed because `Position` and
# `Streams` do not exist in join_audio_charts; all_of() keeps that failure
# explicit. Confirm which chart table this step is meant to read.
numeric_charts_summarizes <- chart_group %>%
  summarise(
    across(all_of(numeric_col_charts), list(min = min, max = max, avg = mean)),
    .groups = "drop_last"
  )
Note: Using an external vector in selections is ambiguous.
i Use `all_of(numeric_col_charts)` instead of `numeric_col_charts` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
Error: Problem with `summarise()` input `..1`.
x Can't subset columns that don't exist.
x Columns `Position` and `Streams` don't exist.
i Input `..1` is `(function (.cols = everything(), .fns = NULL, ..., .names = NULL) ...`.
i The error occurred in group 1: artist_name = "*NSYNC", artist_all = "*NSYNC", artist_key = "*NSYNC", track_name = "Merry Christmas, Happy Holidays", external_urls_spotify = "https://open.spotify.com/track/15coTBAzEN1bOeipoNDZAR", album_name = "Home For Christmas", album_release_year = 1998.
Run `rlang::last_error()` to see where the error occurred.
# Show the ten rules of `rules.sub` with the highest lift.
rules.sub %>%
  sort(by = "lift", decreasing = TRUE) %>%
  head(10) %>%
  inspect()
lhs rhs support confidence coverage lift count
[1] {instrumentalness=0.00e+00,
album_type=album,
nivel_puteada=exp_alto} => {nivel_ranking=chart_100a200} 0.1285047 0.7432432 0.1728972 1.178178 55
[2] {instrumentalness=0.00e+00,
explicit=TRUE,
album_type=album,
nivel_puteada=exp_alto} => {nivel_ranking=chart_100a200} 0.1285047 0.7432432 0.1728972 1.178178 55
[3] {instrumentalness=0.00e+00,
album_type=album,
nivel_puteada=exp_alto,
nivel_popularidad=pop_bajo} => {nivel_ranking=chart_100a200} 0.1285047 0.7432432 0.1728972 1.178178 55
[4] {instrumentalness=0.00e+00,
explicit=TRUE,
album_type=album,
nivel_puteada=exp_alto,
nivel_popularidad=pop_bajo} => {nivel_ranking=chart_100a200} 0.1285047 0.7432432 0.1728972 1.178178 55
[5] {album_type=album,
nivel_puteada=exp_alto} => {nivel_ranking=chart_100a200} 0.1728972 0.7254902 0.2383178 1.150036 74
[6] {explicit=TRUE,
album_type=album,
nivel_puteada=exp_alto} => {nivel_ranking=chart_100a200} 0.1728972 0.7254902 0.2383178 1.150036 74
[7] {album_type=album,
nivel_puteada=exp_alto,
nivel_popularidad=pop_bajo} => {nivel_ranking=chart_100a200} 0.1728972 0.7254902 0.2383178 1.150036 74
[8] {explicit=TRUE,
album_type=album,
nivel_puteada=exp_alto,
nivel_popularidad=pop_bajo} => {nivel_ranking=chart_100a200} 0.1728972 0.7254902 0.2383178 1.150036 74
[9] {nivel_puteada=exp_bajo} => {nivel_ranking=chart_1a100} 0.1658879 0.4226190 0.3925234 1.144816 71
[10] {explicit=TRUE,
nivel_puteada=exp_bajo} => {nivel_ranking=chart_1a100} 0.1658879 0.4226190 0.3925234 1.144816 71
# Histogram of every continuous variable of audio_features, with vertical
# reference lines at the mean (red) and the median (blue).
for (feature in features_continuas) {
  valores <- df_audio_features[, feature]
  titulo <- paste("Histograma de", feature, "(all data)")
  hist(valores, main = titulo, xlab = feature)
  abline(v = mean(valores, na.rm = TRUE), col = "red")
  abline(v = median(valores, na.rm = TRUE), col = "blue")
  legend("topright", legend = c("Media", "Mediana"), col = c("red", "blue"), lty = 1)
}

# Split the features into two groups according to their distribution.
features_continuas_media <- c("danceability", "tempo", "valence")
features_continuas_mediana <- c(
  "acousticness", "duration_ms", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "cant_markets"
)
# Histogram of every continuous variable (plus Streams) of the charts join,
# with vertical reference lines at the mean (red) and the median (blue).
for (feature in c(features_continuas, "Streams")) {
  valores <- join_audio_charts[, feature]
  titulo <- paste("Histograma de", feature, "(charts)")
  hist(valores, main = titulo, xlab = feature)
  abline(v = mean(valores, na.rm = TRUE), col = "red")
  abline(v = median(valores, na.rm = TRUE), col = "blue")
}

# Split the chart features into two groups according to their distribution.
audio_charts_continuas_media <- c("duration_ms", "valence")
audio_charts_continuas_mediana <- c(
  "danceability", "acousticness", "tempo", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "cant_markets", "Streams"
)
# Frequency barplots (most frequent level first) of the categorical variables
# of audio_features.
for (categoria in features_categoricas) {
  frecuencias <- sort(table(df_audio_features[, categoria]), decreasing = TRUE)
  barplot(frecuencias, las = 2, main = paste("Barplot de", categoria, "(all data)"))
  # pie(table(df_features_categoricos[, categoria]))
}

# Same barplots for the categorical variables of join_audio_charts.
for (categoria in features_categoricas) {
  frecuencias <- sort(table(join_audio_charts[, categoria]), decreasing = TRUE)
  barplot(frecuencias, las = 2, main = paste("Barplot de", categoria, "(charts)"))
  # pie(table(df_features_categoricos[, categoria]))
}
# markets_concat
# Reverse join: start from every track with audio features and flag whether it
# also appears in the charts (isinchart = 1) or not (isinchart = 0).
# (The section heading used to be fused with this comment, which made the
# line unparseable as R.)
df_chart_tojoin <- df_charts[, c("Track_Name", "Artist", "URL")]
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[
  , c("track_name", "artist_key", "external_urls_spotify", "markets_concat")
]
# Both inputs already hold exactly the needed columns, so no further select()
# is required before joining.
join_barplot <- df_audio_features_tojoin %>%
  left_join(
    df_chart_tojoin,
    by = c(
      "track_name" = "Track_Name",
      "artist_key" = "Artist",
      "external_urls_spotify" = "URL"
    )
  )
join_barplot$isinchart[is.na(join_barplot$isinchart)] <- 0
join_barplot$isinchart <- factor(join_barplot$isinchart)

# Count how often each market code occurs; markets_concat holds the codes as a
# comma-separated string, and strsplit() is vectorised over the whole column.
mercados_en_chart <- join_barplot[join_barplot$isinchart == 1, "markets_concat"]
mercados_fuera_chart <- join_barplot[join_barplot$isinchart == 0, "markets_concat"]
tabla_isinchart <- table(unlist(strsplit(as.character(mercados_en_chart), ",")))
tabla_notinchart <- table(unlist(strsplit(as.character(mercados_fuera_chart), ",")))

# Axis caption built from the first and last three market codes.
all_countries <- names(tabla_isinchart)
xlabs <- paste(
  paste(head(all_countries, 3), collapse = ","), "...",
  paste(tail(all_countries, 3), collapse = ","), "(ISO-Codes de Paises)",
  collapse = ","
)

options(scipen = 999)
par(mfrow = c(1, 2), las = 1, mar = c(3, 3, 5, 3), oma = c(0, 1, 1, 1))
barplot(sort(tabla_isinchart, decreasing = TRUE), names.arg = "", main = "En Charts",
        col = rgb(0.2, 0.4, 0.6, 0.6), xlab = "Paises (ISO-Codes)")
# mtext(side = 1, text = xlabs, line = 1)
barplot(sort(tabla_notinchart, decreasing = TRUE), names.arg = "", main = "Fuera de Charts",
        col = rgb(0.2, 0.4, 0.6, 0.6), xlab = "Paises (ISO-Codes)")
mtext(side = 1, text = xlabs, line = 1, adj = 2)
mtext("Frecuencia de mercados habilitados", side = 3, line = -1, outer = TRUE, cex = 1.3, font = 2)
# mtext("Paises (ISO-Codes)", side = 3, line = -25, outer = TRUE)
# Correlation matrix of the continuous audio_features attributes.
columnas_audio <- c(features_continuas_media, features_continuas_mediana)
x <- df_audio_features[, columnas_audio] %>%
  cor(use = "complete.obs")
corrplot(x, type = "upper", title = "Correlacion de atributos de audio_features",
         mar = c(0, 0, 1, 0), method = "number", number.cex = 0.7)

# Correlation matrix of the (standardised) continuous attributes of the charts.
columnas_charts <- c(audio_charts_continuas_media, audio_charts_continuas_mediana)
x <- join_audio_charts[, columnas_charts] %>%
  scale() %>%
  cor(use = "complete.obs")
corrplot(x, type = "upper", title = "Correlacion de atributos de los Charts",
         mar = c(0, 0, 1, 0), method = "number", number.cex = 0.7)
# Chi-squared test of key_name against album_type.
# Original author's note: this test cannot be used when n is large.
tabla_key_album <- table(
  df_audio_features[["key_name"]],
  df_audio_features[["album_type"]]
)
cat("Tabla de contigencia entre key y album type\n")
print(tabla_key_album)
chisq.test(tabla_key_album)
# Split the features into two groups according to their distribution, then
# draw one horizontal boxplot per feature on a 4 x 3 grid.
features_continuas_media <- c("danceability", "tempo", "valence")
features_continuas_mediana <- c(
  "acousticness", "duration_ms", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "cant_markets"
)
all_features <- c(features_continuas_media, features_continuas_mediana)

par(mfrow = c(4, 3))
for (feature in all_features) {
  boxplot(x = df_audio_features[, feature], main = feature, horizontal = TRUE, las = 2)
}
Con excepción de valence, el resto de las features poseía cierto sesgo. Se decidió transformar las variables con mayor sesgo (duration_ms, instrumentalness, liveness y speechiness) como método para corregir la distribución y reducir la cantidad de outliers. La variable loudness_reg_imp no fue modificada debido a que, al ser negativa, no admite transformaciones como el logaritmo o la raíz cuadrada.
# "danceability,tempo,valence,acousticness,duration_ms,energy,instrumentalness,liveness,speechiness,cant_markets"
# Skewness of every continuous variable, sorted ascending, measured as
# 3 * (mean - median) / sd.
sesgo_pearson <- function(valores) {
  centro <- mean(valores, na.rm = TRUE)
  mediana <- median(valores, na.rm = TRUE)
  (3 * (centro - mediana)) / sd(valores, na.rm = TRUE)
}
sort(vapply(df_audio_features[features_continuas], sesgo_pearson, numeric(1)))

# Variables selected for the skew-correcting transformations below.
variables_sesgo <- c(
  "acousticness", "duration_ms", "instrumentalness", "liveness",
  "speechiness", "cant_markets", "energy"
)
df_sesgadas <- df_audio_features[, variables_sesgo]
# Base-10 logarithm that tolerates exact zeros.
#
# x:     numeric vector (a single value also works).
# delta: small positive number used in place of every exact zero.
# Returns a numeric vector the same length as x: log10(delta) where x == 0 and
# log10(x) elsewhere. NA stays NA, and negative values still yield NaN.
#
# The previous version tested `x == 0` inside `if`, which only works for a
# single non-missing value: NA, empty or longer input raised an error.
logaritmo_ajustado <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- delta
  log(x, base = 10)
}
# Replacement value for zeros inside logaritmo_ajustado().
delta <- 10^(-6)

# Apply the adjusted log10 cell by cell to the skewed variables.
log_celdas <- apply(
  df_audio_features[, variables_sesgo],
  MARGIN = c(1, 2),
  FUN = logaritmo_ajustado,
  delta = delta
)
df_sesgadas_log_adjust <- data.frame(log_celdas)
# names(df_sesgadas_log_adjust) <- paste(names(df_sesgadas), "_log", sep = "")
names(df_sesgadas_log_adjust) <- names(df_sesgadas)
df_datos <- cbind(df_sesgadas, df_sesgadas_log_adjust)

# Put raw and transformed columns side by side, with the transformed ones
# suffixed "_log" and all columns ordered by name.
a <- df_sesgadas
b <- df_sesgadas_log_adjust
names(b) <- paste0(names(df_sesgadas), "_log")
merged <- cbind(a, b)
merged <- merged[, order(names(merged))]

# Skewness, 3 * (mean - median) / sd, of every column, sorted and rounded.
round(
  sort(apply(merged, MARGIN = 2, function(v) {
    (3 * (mean(v, na.rm = TRUE) - median(v, na.rm = TRUE))) / sd(v, na.rm = TRUE)
  })),
  2
)
# Histograms of duration_ms next to its log-transformed version.
variables_plot <- "duration_ms"
variables_plot <- c(variables_plot, paste0(variables_plot, "_log"))
variables_plot <- sort(variables_plot)
plotear <- merged[, variables_plot]

par(mfrow = c(1, 2))
for (nombre in names(plotear)) {
  hist(plotear[, nombre], breaks = "FD", main = nombre, xlab = "")
}

summary(df_audio_features[, all_features])
hist(log(df_audio_features$duration_ms))

# Features to which the transformations are applied in the following chunks.
transformacion <- c("instrumentalness", "loudness", "liveness", "speechiness", "duration_ms")
# Base-10 logarithm that tolerates zeros and negative values.
#
# x:     numeric vector (a single value also works).
# delta: small positive number used in place of every value <= 0.
# Returns a numeric vector the same length as x: log10(delta) where x <= 0 and
# log10(x) elsewhere. NA stays NA.
#
# The previous version tested `x <= 0` inside `if`, which only works for a
# single non-missing value: NA, empty or longer input raised an error.
logaritmo_ajustado <- function(x, delta) {
  no_positivo <- !is.na(x) & x <= 0
  x[no_positivo] <- delta
  log(x, base = 10)
}
# Replacement value passed to logaritmo_ajustado().
delta <- 10^(-6)
# Lay the plots out on a 2 x 4 grid.
# NOTE(review): every feature in `transformacion` is drawn twice (raw, then
# log-transformed), which may be more panels than the grid holds — confirm.
par(mfrow=c(2,4))
# Raw distribution of each feature.
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
# Distribution of each feature after logaritmo_ajustado(), applied value by value.
for (feature in transformacion){
hist(unlist(lapply(df_audio_features[,feature], function(x) logaritmo_ajustado(x,delta))), main=paste(feature,"log", sep="_"))
}
# Inverse square root, 1 / sqrt(x), that tolerates exact zeros.
#
# x:     numeric vector (a single value also works).
# delta: small positive number used in place of every exact zero.
# Returns a numeric vector the same length as x: 1 / sqrt(delta) where x == 0
# and 1 / sqrt(x) elsewhere. NA stays NA, and negative values still yield NaN.
#
# The previous version tested `x == 0` inside `if`, which only works for a
# single non-missing value: NA, empty or longer input raised an error.
inv_sqrt_ajustada <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- delta
  1 / sqrt(x)
}
# Replacement value passed to inv_sqrt_ajustada().
delta <- 10^(-6)
# First figure: raw features, then their inverse-square-root transform.
par(mfrow=c(2,4))
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
# inv_sqrt_ajustada() is applied value by value to each feature.
for (feature in transformacion){
hist(unlist(lapply(df_audio_features[,feature], function(x) inv_sqrt_ajustada(x,delta))), main=paste(feature,"inv_sqt", sep="_"))
}
# Second figure: raw features, then their plain square-root transform.
par(mfrow=c(2,4))
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
# NOTE(review): sqrt() of a negative value yields NaN; check the features in
# `transformacion` for negative values before relying on these panels.
for (feature in transformacion){
hist(sqrt(df_audio_features[,feature]), main=paste(feature,"sqrt", sep="_"))
}
# Regression-based imputation of loudness, followed by plots of the result.
# The plots used to come first and read `loudness_reg_imp` before it existed;
# the imputation now runs before anything reads the column.

# Linear model of loudness on energy and acousticness.
# NOTE(review): the model is fitted on all rows, including those whose
# loudness is later replaced — confirm this is intended.
fit <- lm(loudness~energy+acousticness, data=df_audio_features)
modelo <- fit$coefficients

# Start from the observed values and overwrite only the rows with loudness > 0
# (presumably invalid, since loudness is expected to be negative — confirm).
df_audio_features$loudness_reg_imp <- df_audio_features$loudness
# which() drops NA comparisons; a logical index containing NA would abort the
# subscripted assignment below.
filas_positivas <- which(df_audio_features$loudness > 0)
X <- df_audio_features[filas_positivas, c("energy", "acousticness")]
df_audio_features$loudness_reg_imp[filas_positivas] <-
  modelo[1] + modelo[2] * X[, 1] + modelo[3] * X[, 2]

summary(df_audio_features[, c("loudness", "loudness_reg_imp")])
summary(fit)

# Distribution of the imputed variable.
par(mfrow = c(2, 1))
hist(df_audio_features[, "loudness_reg_imp"], main = "loudness", xlab = "")
# hist(sqrt(df_audio_features[, "loudness_reg_imp"]), main = "loudness_sqrt", xlab = "")
boxplot(df_audio_features[, "loudness_reg_imp"], horizontal = TRUE)
# boxplot(sqrt(df_audio_features[, "loudness_reg_imp"]), horizontal = TRUE)
La variable instrumentalness tiene mucho sesgo. Se va a recurrir a una logaritmización de la variable, previa transformación del dominio, haciendo que los valores que son 0 sean en realidad 0.000001 (delta = 10^-6).
# Base-10 logarithm that tolerates exact zeros.
#
# x:     numeric vector (a single value also works).
# delta: small positive number added to every exact zero before the log.
# Returns a numeric vector the same length as x: log10(delta) where x == 0 and
# log10(x) elsewhere. NA stays NA, and negative values still yield NaN.
#
# The previous version tested `x == 0` inside `if`, which only works for a
# single non-missing value: NA, empty or longer input raised an error.
logaritmo_ajustado <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- x[es_cero] + delta
  log(x, base = 10)
}
# Replacement value for zeros inside logaritmo_ajustado().
delta <- 10^(-6)
# Store the adjusted log10 of instrumentalness, computed value by value.
df_audio_features$instrumentalness_logadjust <- unlist(
  lapply(X = df_audio_features$instrumentalness, FUN = logaritmo_ajustado, delta = delta)
)

# 2 x 2 figure: histograms on top, boxplots below; raw variable on the left,
# log-adjusted variable on the right.
instrumentalness_cruda <- df_audio_features$instrumentalness
instrumentalness_log <- df_audio_features$instrumentalness_logadjust
par(mfrow = c(2, 2))
hist(instrumentalness_cruda, main = "insrumentalness", xlab = "")
hist(instrumentalness_log, main = "instrumentalness_logadjust", ylim = c(0, 130500), xlab = "")
boxplot(instrumentalness_cruda, main = "", horizontal = TRUE)
boxplot(instrumentalness_log, main = "", horizontal = TRUE)
# hist(log(1/sqrt(df_audio_features$instrumentalness+0.00001)),main='log(sqrt(x+))', ylim=c(0,130500), xlab = "")
¿Es útil esta transformación?
# Compare the distribution of instrumentalness (raw and log-adjusted) between
# tracks that are in the charts and tracks that are not.
delta <- 10^(-6)
df_audio_features$instrumentalness_logadjust <- unlist(
  lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x, delta))
)

# Flag every track with audio features as in the charts (1) or not (0).
df_chart_tojoin <- df_charts[, c("Track_Name", "Artist", "URL")]
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[
  , c("track_name", "artist_key", "external_urls_spotify",
      "instrumentalness", "instrumentalness_logadjust")
]
# Both inputs already hold exactly the needed columns, so the former mix of
# `dplyr::select` and bare `select` calls is not needed.
join_histogram <- df_audio_features_tojoin %>%
  left_join(
    df_chart_tojoin,
    by = c(
      "track_name" = "Track_Name",
      "artist_key" = "Artist",
      "external_urls_spotify" = "URL"
    )
  )
join_histogram$isinchart[is.na(join_histogram$isinchart)] <- 0
join_histogram$isinchart <- factor(join_histogram$isinchart)

# Histogram object whose `density` slot holds the percentage of observations
# per bin. plot = FALSE stops hist() from drawing a stray plot for each of the
# four intermediate objects.
hist_porcentaje <- function(valores) {
  h <- hist(valores, plot = FALSE)
  h$density <- h$counts / sum(h$counts) * 100
  h
}
h11 <- hist_porcentaje(join_histogram[join_histogram$isinchart == 1, "instrumentalness"])
h12 <- hist_porcentaje(join_histogram[join_histogram$isinchart == 0, "instrumentalness"])
h21 <- hist_porcentaje(join_histogram[join_histogram$isinchart == 1, "instrumentalness_logadjust"])
h22 <- hist_porcentaje(join_histogram[join_histogram$isinchart == 0, "instrumentalness_logadjust"])

#png("C:/Users/Asus/Desktop/DATA SCIENCE/MAESTRIA/Data Mining/TP/graficos/instrumentalness.png",
# width = 800, height = 800)
par(mfrow = c(3, 2))
# freq = FALSE makes plot() use the percentage stored in `density`.
# The axis label is corrected from "Porcentage" to "Porcentaje".
plot(h11, main = "instrumentalness \nchart", xlab = "", ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100))
plot(h12, main = "instrumentalness \nfuera chart", xlab = "", ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100))
plot(h21, main = "instrumentalness_log \nchart", xlab = "", ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100))
plot(h22, main = "instrumentalness_log \nfuera chart", xlab = "", ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100))
boxplot(join_histogram[join_histogram$isinchart == 1, "instrumentalness_logadjust"], main = "instrumentalness_log chart", horizontal = TRUE)
boxplot(join_histogram[join_histogram$isinchart == 0, "instrumentalness_logadjust"], main = "instrumentalness_log fuera chart", horizontal = TRUE)
#dev.off()
################################
## OUTLIER DETECTION BY Z-SCORE for 'danceability', 'tempo', 'valence'
################################
# The z-score is used for the variables whose distribution is close to normal.
features_continuas_media <- c("danceability", "tempo", "valence")
df_audio_features_zscore_media <- df_audio_features[, features_continuas_media]

# Add one "zscore_<feature>" column per feature: (x - mean) / sd.
# na.rm = TRUE keeps a single missing value from turning the whole column into
# NA, consistent with the modified z-score computed later in this file.
zscore_cols <- paste0("zscore_", features_continuas_media)
for (col in features_continuas_media) {
  valores <- df_audio_features_zscore_media[, col]
  df_audio_features_zscore_media[, paste0("zscore_", col)] <-
    (valores - mean(valores, na.rm = TRUE)) / sd(valores, na.rm = TRUE)
}

# One boxplot per z-score column; invisible() suppresses the printed list of
# boxplot statistics that lapply() would otherwise return to the console.
par(mfrow = c(1, length(zscore_cols)))
invisible(lapply(
  zscore_cols,
  function(col) boxplot(df_audio_features_zscore_media[, col], xlab = col)
))
# For each variable, list the tracks whose z-score is more than
# `umbral_zscore` standard deviations away from zero, largest value first.

# Variable: danceability
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_danceability) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, danceability) %>%
  arrange(desc(danceability))

# Variable: tempo
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_tempo) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, tempo) %>%
  arrange(desc(tempo))

# Variable: valence
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_valence) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, valence) %>%
  arrange(desc(valence))
################################
## OUTLIER DETECTION BY MODIFIED Z-SCORE for 'acousticness', 'duration_ms',
## 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
## 'cant_markets'
################################
features_continuas_mediana <- c(
  "acousticness", "duration_ms", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "cant_markets"
)
df_audio_features_zscore_mediana <- df_audio_features[, features_continuas_mediana]

# Add one "zscoremodif_<feature>" column per feature:
#   0.6745 * (x - median) / MAD, with MAD the median absolute deviation.
zscoremodif_cols <- paste0("zscoremodif_", features_continuas_mediana)
for (col in features_continuas_mediana) {
  valores <- df_audio_features_zscore_mediana[, col]
  med <- median(valores, na.rm = TRUE)
  desvios_abs <- abs(valores - med)
  MAD <- median(desvios_abs, na.rm = TRUE)
  if (!is.na(MAD) && MAD > 0) {
    zscore_modif <- 0.6745 * (valores - med) / MAD
  } else {
    # MAD is 0 when more than half of the values equal the median; dividing by
    # it would give Inf/NaN scores. Fall back to the mean absolute deviation,
    # the usual substitute in that case.
    zscore_modif <- (valores - med) / (1.253314 * mean(desvios_abs, na.rm = TRUE))
  }
  df_audio_features_zscore_mediana[, paste0("zscoremodif_", col)] <- zscore_modif
}

# One horizontal boxplot per modified z-score column; invisible() suppresses
# the printed list of boxplot statistics that lapply() would otherwise return.
par(mfrow = c(4, 2))
invisible(lapply(
  zscoremodif_cols,
  function(col) boxplot(df_audio_features_zscore_mediana[, col], xlab = col, horizontal = TRUE)
))
# Instrumentalness
# Discretise instrumentalness into equal-width bins and plot the bin counts.
# (The section heading used to be fused into the variable name on this line.)
instrumentalness <- c("instrumentalness", "zscoremodif_instrumentalness")
x <- df_audio_features$instrumentalness
n_interv <- 10

# Use one set of break points for both the binning and the labels. The labels
# used to come from seq(0, max(x), ...) while cut() binned from min(x), so the
# two could disagree (or the label count could differ from the bin count).
cortes <- seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE), length.out = n_interv + 1)
intervalos <- round(cortes, 2)
# Each label shows the lower and upper limit of its bin on two lines.
labs <- paste(intervalos[-(n_interv + 1)], intervalos[-1], sep = "\n")

bins <- cut(x, breaks = cortes, include.lowest = TRUE, labels = labs)
barplot(table(bins))
Hacemos K-means para poder discretizar la variable.
# Elbow plot: total within-cluster sum of squares of k-means on
# instrumentalness for k = 2..6.
k_candidatos <- 2:6
sse <- numeric(0)
for (k in k_candidatos) {
  clusters <- kmeans(
    x = df_audio_features$instrumentalness,
    centers = k,
    iter.max = 10,
    nstart = k
  )
  sse <- c(sse, clusters$tot.withinss)
}
plot(k_candidatos, sse, type = "l", xlab = "Cantidad de Clusters", ylab = "Suma Error Cuadrático")
# k = 3: cluster instrumentalness and store the cluster id as a factor column.
clusters3 <- kmeans(df_audio_features$instrumentalness, centers = 3, iter.max = 10, nstart = 3)
df_audio_features$clusters <- factor(clusters3$cluster)

# Build one "min - max" label per cluster from the instrumentalness range of
# its members. The previous loop stored the limits in variables named `min`
# and `max`, shadowing the base functions in the global environment; the
# limits are now computed inside a local function.
lev <- levels(df_audio_features$clusters)
labs <- vapply(
  lev,
  function(nivel) {
    valores <- df_audio_features$instrumentalness[df_audio_features$clusters == nivel]
    paste(min(valores), max(valores), sep = " - ")
  },
  character(1),
  USE.NAMES = FALSE
)
labs
# barplot(table(factor(clusters3$cluster)), labels = labs)
¿Qué características tienen las canciones que están en el chart? ¿Cuál es el patrón común que tienen las canciones más escuchadas? (ver dispersiones, media, gráfico comparativo)
# Standardise a variable: subtract its mean and divide by its standard
# deviation, both computed ignoring missing values.
scale_vble <- function(x) {
  centro <- mean(x, na.rm = TRUE)
  dispersion <- sd(x, na.rm = TRUE)
  (x - centro) / dispersion
}
# Anti join: tracks with audio features that do NOT appear in the charts,
# matched on track URL and artist key.
anti_join_audio_charts <- df_audio_features %>%
  dplyr::select(
    "artist_name", "artist_all", "artist_key",
    "track_name", "external_urls_spotify", "album_name", "album_release_year",
    all_of(features_continuas), all_of(features_categoricas)
  ) %>%
  anti_join(
    df_charts %>%
      dplyr::select("Track_Name", "Artist", "URL"),
    by = c(
      "external_urls_spotify" = "URL",
      "artist_key" = "Artist"
    )
  )
# by = c("track_name" = "Track_Name"))

# Keep only the complete rows.
anti_join_audio_charts_complete <- na.omit(anti_join_audio_charts)

# Standardised continuous features of the distinct complete rows.
# all_of() marks features_continuas as an external vector of column names
# (a bare vector in select() is ambiguous), and across() replaces the
# superseded mutate_all().
anti_join_audio_charts_complete_scale <- anti_join_audio_charts_complete %>%
  distinct() %>%
  dplyr::select(all_of(features_continuas)) %>%
  mutate(across(everything(), scale_vble))
nrow(anti_join_audio_charts_complete_scale)
# Number of rows of the charts join per artist, most frequent first.
join_audio_charts %>%
  group_by(artist_name) %>%
  dplyr::summarise(n = n()) %>%
  arrange(desc(n))

# Number of rows per (track, artist, URL), most frequent first, with the track
# name and the count as leading columns.
join_audio_charts %>%
  group_by(track_name, artist_name, external_urls_spotify) %>%
  dplyr::summarise(n = n()) %>%
  arrange(desc(n)) %>%
  select(track_name, n, everything())
# How long each track stayed in the chart: first week start, last week end,
# and the span between them in days and in calendar years.
df_charts %>%
  mutate(
    week_start = as.Date(week_start),
    week_end = as.Date(week_end),
    week_year = year(week_start)
  ) %>%
  arrange(Artist, Track_Name) %>%
  group_by(Artist, Track_Name, URL) %>%
  dplyr::summarise(
    day_in = min(week_start),
    year_in = year(day_in),
    day_max = max(week_end),
    year_max = year(day_max),
    duracion_chart_dias = day_max - day_in,
    duracion_chart_anio = year_max - year_in
  ) %>%
  arrange(Artist)
# Trial of the log10 transformation followed by a Shapiro-Wilk normality test,
# printing the p-value of every continuous feature.
#
# The recorded run aborted with "sample size must be between 3 and 5000",
# which stopped the loop before the remaining features were tested. Each
# feature is now checked first:
#   * only strictly positive, non-missing values are log-transformed, since
#     log10() gives -Inf for 0 and NaN for negative values;
#   * a feature is skipped with a message when the number of usable values is
#     outside the range shapiro.test() accepts.
for (feature in features_continuas) {
  valores <- df_chart_w_lyrics[[feature]]
  valores_log <- log10(valores[!is.na(valores) & valores > 0])
  n_validos <- length(valores_log)
  if (n_validos < 3 || n_validos > 5000) {
    message(
      "Skipping ", feature, ": ", n_validos,
      " usable values (shapiro.test needs between 3 and 5000)"
    )
    next
  }
  print(shapiro.test(valores_log)$p.value)
}
[1] 1.85241e-21
[1] 1.167246e-23
[1] 7.44622e-11
[1] 1.256851e-30
[1] NaN
[1] 3.287572e-17
NaNs produced
Error in shapiro.test(x) : sample size must be between 3 and 5000